Lesson 4


Scatterplots and Perceived Audience Size

Notes:


Scatterplots

Notes:

# Load the plotting library and read the tab-separated pseudo-Facebook data.
library(ggplot2)
# NOTE(review): absolute, machine-specific path -- the data only loads on a
# machine that has the file at exactly this location.
pf <- read.csv(
  "E:\\Study\\coursera\\Udacity\\ud651DataAnalysisWithR\\pseudo_facebook.tsv",
  sep = "\t"
)

# Scatterplot of friend count against age, with x and y passed by name.
qplot(data = pf, x = age, y = friend_count)

# The same plot, with x and y passed positionally.
qplot(age, friend_count, data = pf)


What are some things that you notice right away?

Response:


ggplot Syntax

Notes:

# Friend count against age, written with the layered ggplot() syntax.
ggplot(data = pf, aes(x = age, y = friend_count)) +
  geom_point()

# Distribution summary of the age variable.
summary(pf$age)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   13.00   20.00   28.00   37.28   50.00  113.00
# Same scatterplot with the x axis limited to ages 13 through 90.
ggplot(data = pf, aes(x = age, y = friend_count)) +
  geom_point() +
  xlim(13, 90)
## Warning: Removed 4906 rows containing missing values (geom_point).


Overplotting

Notes:

# Draw each point at 1/20 opacity so that dense regions become visible.
ggplot(data = pf, aes(x = age, y = friend_count)) +
  geom_point(alpha = 1 / 20) +
  xlim(13, 90)
## Warning: Removed 4906 rows containing missing values (geom_point).

# Same transparency, but with jittered points instead of exact positions.
ggplot(data = pf, aes(x = age, y = friend_count)) +
  geom_jitter(alpha = 1 / 20) +
  xlim(13, 90)
## Warning: Removed 5178 rows containing missing values (geom_point).

What do you notice in the plot?

Response:


Coord_trans()

Notes:

# Friend count against age with a square-root transformed y axis.
ggplot(data = pf, aes(x = age, y = friend_count)) +
  geom_point(alpha = 1 / 20) +
  xlim(13, 90) +
  coord_trans(y = "sqrt")
## Warning: Removed 4906 rows containing missing values (geom_point).

# Same plot with jitter applied to the points. `height = 0` is spelled out in
# full (the original relied on partial matching of `h`) and turns off the
# vertical jitter.
ggplot(data = pf, aes(x = age, y = friend_count)) +
  geom_point(alpha = 1 / 20, position = position_jitter(height = 0)) +
  xlim(13, 90) +
  coord_trans(y = "sqrt")
## Warning: Removed 5182 rows containing missing values (geom_point).

Look up the documentation for coord_trans() and add a layer to the plot that transforms friend_count using the square root function. Create your plot!

What do you notice?


Alpha and Jitter

Notes:

# Friendships initiated against age, transparent points only.
ggplot(data = pf, aes(x = age, y = friendships_initiated)) +
  geom_point(alpha = 1 / 20) +
  xlim(13, 90)
## Warning: Removed 4906 rows containing missing values (geom_point).

# Add the default jitter position adjustment.
ggplot(data = pf, aes(x = age, y = friendships_initiated)) +
  geom_point(alpha = 1 / 20, position = "jitter") +
  xlim(13, 90)
## Warning: Removed 5162 rows containing missing values (geom_point).

# Jitter with the vertical component switched off. The argument is written as
# `height` in full rather than relying on partial matching of `h`.
ggplot(data = pf, aes(x = age, y = friendships_initiated)) +
  geom_point(alpha = 1 / 20, position = position_jitter(height = 0)) +
  xlim(13, 90)
## Warning: Removed 5178 rows containing missing values (geom_point).

# Same as above, with a square-root transformed y axis.
ggplot(data = pf, aes(x = age, y = friendships_initiated)) +
  geom_point(alpha = 1 / 20, position = position_jitter(height = 0)) +
  xlim(13, 90) +
  coord_trans(y = "sqrt")
## Warning: Removed 5178 rows containing missing values (geom_point).


Overplotting and Domain Knowledge

Notes:


Conditional Means

Notes:

#install.packages('dplyr')
library('dplyr')
## Warning: package 'dplyr' was built under R version 3.3.1
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
#filter()
#group_by()
#mutate()
#arrange()

# Group the users by age, then compute the mean and median friend count and
# the number of users for each age, ordered by age.
age_groups <- group_by(pf, age)
pf.fc_by_age <- arrange(
  summarise(
    age_groups,
    friend_count_mean = mean(friend_count),
    friend_count_median = median(friend_count),
    n = n()
  ),
  age
)
head(pf.fc_by_age)
## Source: local data frame [6 x 4]
## 
##     age friend_count_mean friend_count_median     n
##   <int>             <dbl>               <dbl> <int>
## 1    13          164.7500                74.0   484
## 2    14          251.3901               132.0  1925
## 3    15          347.6921               161.0  2618
## 4    16          351.9371               171.5  3086
## 5    17          350.3006               156.0  3283
## 6    18          331.1663               162.0  5196

Conditional Means Alternate Code

# Per-age friend-count summary written as a single pipeline:
# group by age, summarise, then order the rows by age.
pf.fc_by_age <- pf %>%
  group_by(age) %>%
  summarise(
    friend_count_mean = mean(friend_count),
    friend_count_median = median(friend_count),
    n = n()
  ) %>%
  arrange(age)

# Show the first rows of the summary table.
pf.fc_by_age %>% head()
## Source: local data frame [6 x 4]
## 
##     age friend_count_mean friend_count_median     n
##   <int>             <dbl>               <dbl> <int>
## 1    13          164.7500                74.0   484
## 2    14          251.3901               132.0  1925
## 3    15          347.6921               161.0  2618
## 4    16          351.9371               171.5  3086
## 5    17          350.3006               156.0  3283
## 6    18          331.1663               162.0  5196

Create your plot!

# Mean friend count per age, drawn as points.
ggplot(data = pf.fc_by_age, aes(x = age, y = friend_count_mean)) +
  geom_point()

# Mean friend count per age, drawn as a connected line.
ggplot(data = pf.fc_by_age, aes(x = age, y = friend_count_mean)) +
  geom_line()


Overlaying Summaries with Raw Data

Notes:

# Raw friendships_initiated points in orange, jittered with the vertical
# component switched off, on a square-root y axis. `height` is written in full
# instead of the partially matched `h`.
ggplot(data = pf, aes(x = age, y = friendships_initiated)) +
  geom_point(
    alpha = 1 / 20,
    position = position_jitter(height = 0),
    color = "orange"
  ) +
  xlim(13, 90) +
  coord_trans(y = "sqrt")
## Warning: Removed 5186 rows containing missing values (geom_point).

# Same plot with the per-age mean overlaid as a line.
# NOTE: `fun.y` is deprecated in ggplot2 >= 3.3.0 in favour of `fun`; it is
# kept so the code still runs on the older ggplot2 this file was written with.
ggplot(data = pf, aes(x = age, y = friendships_initiated)) +
  geom_point(
    alpha = 1 / 20,
    position = position_jitter(height = 0),
    color = "orange"
  ) +
  xlim(13, 90) +
  coord_trans(y = "sqrt") +
  geom_line(stat = "summary", fun.y = mean)
## Warning: Removed 4906 rows containing non-finite values (stat_summary).
## Warning: Removed 5188 rows containing missing values (geom_point).

# Raw points (orange) with the per-age mean and 10th-percentile lines overlaid.
# `fun.args` takes a list of extra arguments for the summary function. The
# original `(probs = .1)` was a parenthesised assignment: it worked only
# because the bare value was passed positionally, and it created a stray
# global variable `probs` as a side effect.
# NOTE: `fun.y` is deprecated in ggplot2 >= 3.3.0 in favour of `fun`; it is
# kept so the code still runs on the older ggplot2 this file was written with.
ggplot(data = pf, aes(x = age, y = friendships_initiated)) +
  geom_point(
    alpha = 1 / 20,
    position = position_jitter(height = 0),
    color = "orange"
  ) +
  xlim(13, 90) +
  coord_trans(y = "sqrt") +
  geom_line(stat = "summary", fun.y = mean) +
  geom_line(
    stat = "summary", fun.y = quantile, fun.args = list(probs = 0.1),
    linetype = 2, color = "blue"
  )
## Warning: Removed 4906 rows containing non-finite values (stat_summary).
## Warning: Removed 4906 rows containing non-finite values (stat_summary).
## Warning: Removed 5177 rows containing missing values (geom_point).

# Raw points (orange) with the per-age mean plus dashed 10th- and
# 90th-percentile lines. `fun.args` is passed as a proper list; the original
# `(probs = ...)` form was an assignment that leaked a global `probs`.
# NOTE: `fun.y` is deprecated in ggplot2 >= 3.3.0 in favour of `fun`; it is
# kept so the code still runs on the older ggplot2 this file was written with.
ggplot(data = pf, aes(x = age, y = friendships_initiated)) +
  geom_point(
    alpha = 1 / 20,
    position = position_jitter(height = 0),
    color = "orange"
  ) +
  xlim(13, 90) +
  coord_trans(y = "sqrt") +
  geom_line(stat = "summary", fun.y = mean) +
  geom_line(
    stat = "summary", fun.y = quantile, fun.args = list(probs = 0.1),
    linetype = 2, color = "blue"
  ) +
  geom_line(
    stat = "summary", fun.y = quantile, fun.args = list(probs = 0.9),
    linetype = 2, color = "blue"
  )
## Warning: Removed 4906 rows containing non-finite values (stat_summary).
## Warning: Removed 4906 rows containing non-finite values (stat_summary).

## Warning: Removed 4906 rows containing non-finite values (stat_summary).
## Warning: Removed 5168 rows containing missing values (geom_point).

# Raw points (orange) with four summary lines: mean (default colour),
# 10th and 90th percentiles (dashed blue) and median (solid blue).
# `fun.args` is passed as a proper list; the original `(probs = ...)` form was
# an assignment that leaked a global `probs`.
# NOTE: `fun.y` is deprecated in ggplot2 >= 3.3.0 in favour of `fun`; it is
# kept so the code still runs on the older ggplot2 this file was written with.
ggplot(data = pf, aes(x = age, y = friendships_initiated)) +
  geom_point(
    alpha = 1 / 20,
    position = position_jitter(height = 0),
    color = "orange"
  ) +
  xlim(13, 90) +
  coord_trans(y = "sqrt") +
  geom_line(stat = "summary", fun.y = mean) +
  geom_line(
    stat = "summary", fun.y = quantile, fun.args = list(probs = 0.1),
    linetype = 2, color = "blue"
  ) +
  geom_line(
    stat = "summary", fun.y = quantile, fun.args = list(probs = 0.9),
    linetype = 2, color = "blue"
  ) +
  geom_line(stat = "summary", fun.y = median, color = "blue")
## Warning: Removed 4906 rows containing non-finite values (stat_summary).
## Warning: Removed 4906 rows containing non-finite values (stat_summary).

## Warning: Removed 4906 rows containing non-finite values (stat_summary).

## Warning: Removed 4906 rows containing non-finite values (stat_summary).
## Warning: Removed 5189 rows containing missing values (geom_point).

# Summary-line overlay zoomed to ages 13-40 with coord_cartesian().
# A plot has a single coordinate system, and adding coord_cartesian() replaces
# any coordinate system added earlier. The original coord_trans(y = 'sqrt')
# layer therefore never took effect and is removed here; the rendered plot is
# unchanged.
# `fun.args` is passed as a proper list; the original `(probs = ...)` form was
# an assignment that leaked a global `probs`.
# NOTE: `fun.y` is deprecated in ggplot2 >= 3.3.0 in favour of `fun`; it is
# kept so the code still runs on the older ggplot2 this file was written with.
ggplot(data = pf, aes(x = age, y = friendships_initiated)) +
  geom_point(
    alpha = 1 / 20,
    position = position_jitter(height = 0),
    color = "orange"
  ) +
  xlim(13, 90) +
  geom_line(stat = "summary", fun.y = mean) +
  geom_line(
    stat = "summary", fun.y = quantile, fun.args = list(probs = 0.1),
    linetype = 2, color = "blue"
  ) +
  geom_line(
    stat = "summary", fun.y = quantile, fun.args = list(probs = 0.9),
    linetype = 2, color = "blue"
  ) +
  geom_line(stat = "summary", fun.y = median, color = "blue") +
  coord_cartesian(xlim = c(13, 40))
## Warning: Removed 4906 rows containing non-finite values (stat_summary).
## Warning: Removed 4906 rows containing non-finite values (stat_summary).

## Warning: Removed 4906 rows containing non-finite values (stat_summary).

## Warning: Removed 4906 rows containing non-finite values (stat_summary).
## Warning: Removed 5167 rows containing missing values (geom_point).

# Summary-line overlay with no scale limits, zoomed with coord_cartesian() to
# ages 13-70 and 0-1000 friendships initiated.
# `fun.args` is passed as a proper list; the original `(probs = ...)` form was
# an assignment that leaked a global `probs`.
# NOTE: `fun.y` is deprecated in ggplot2 >= 3.3.0 in favour of `fun`; it is
# kept so the code still runs on the older ggplot2 this file was written with.
ggplot(data = pf, aes(x = age, y = friendships_initiated)) +
  geom_point(
    alpha = 1 / 20,
    position = position_jitter(height = 0),
    color = "orange"
  ) +
  geom_line(stat = "summary", fun.y = mean) +
  geom_line(
    stat = "summary", fun.y = quantile, fun.args = list(probs = 0.1),
    linetype = 2, color = "blue"
  ) +
  geom_line(
    stat = "summary", fun.y = quantile, fun.args = list(probs = 0.9),
    linetype = 2, color = "blue"
  ) +
  geom_line(stat = "summary", fun.y = median, color = "blue") +
  coord_cartesian(xlim = c(13, 70), ylim = c(0, 1000))

What are some of your observations of the plot?

Response:


Moira: Histogram Summary and Scatterplot

See the Instructor Notes of this video to download Moira’s paper on perceived audience size and to see the final plot.

Notes:


Correlation

Notes:

# Pearson correlation test between age and friend count over all users.
cor.test(
  pf$age,
  pf$friend_count,
  method = "pearson"
)
## 
##  Pearson's product-moment correlation
## 
## data:  pf$age and pf$friend_count
## t = -8.6268, df = 99001, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.03363072 -0.02118189
## sample estimates:
##         cor 
## -0.02740737
# Same Pearson test, with the columns resolved inside `pf` via with().
with(
  pf,
  cor.test(age, friend_count, method = "pearson")
)
## 
##  Pearson's product-moment correlation
## 
## data:  age and friend_count
## t = -8.6268, df = 99001, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.03363072 -0.02118189
## sample estimates:
##         cor 
## -0.02740737

Look up the documentation for the cor.test function.

What’s the correlation between age and friend count? Round to three decimal places. Response:


Correlation on Subsets

Notes:

# Correlation test restricted to users aged 70 or younger, using cor.test()'s
# default method.
with(
  subset(pf, age <= 70),
  cor.test(age, friend_count)
)
## 
##  Pearson's product-moment correlation
## 
## data:  age and friend_count
## t = -52.592, df = 91029, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.1780220 -0.1654129
## sample estimates:
##        cor 
## -0.1717245
# Same test on users aged 70 or younger, with the Pearson method named
# explicitly.
with(
  subset(pf, age <= 70),
  cor.test(age, friend_count, method = "pearson")
)
## 
##  Pearson's product-moment correlation
## 
## data:  age and friend_count
## t = -52.592, df = 91029, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.1780220 -0.1654129
## sample estimates:
##        cor 
## -0.1717245

Correlation Methods

# Spearman rank correlation between age and friend count for users aged 70 or
# younger.
with(
  subset(pf, age <= 70),
  cor.test(age, friend_count, method = "spearman")
)
## Warning in cor.test.default(age, friend_count, method = "spearman"): Cannot
## compute exact p-value with ties
## 
##  Spearman's rank correlation rho
## 
## data:  age and friend_count
## S = 1.5782e+14, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##        rho 
## -0.2552934

Create Scatterplots

Notes:

#ggplot(data = pf, aes(x = age, y = friendships_initiated)) + geom_point(alpha = 1/20, position = position_jitter(h = 0), color = 'orange') + geom_line(stat = 'summary', fun.y = mean) + geom_line(stat = 'summary', fun.y = quantile, fun.args = (probs = .1), linetype = 2, color = 'blue' ) + geom_line(stat = 'summary', fun.y = quantile, fun.args = (probs = .9), linetype = 2, color = 'blue' ) + geom_line(stat = 'summary', fun.y = median,  color = 'blue' ) + coord_cartesian(xlim = c(13,70), ylim = c(0,1000))


# Total likes received against likes received through the web (www).
ggplot(data = pf, aes(x = www_likes_received, y = likes_received)) +
  geom_point()

# Same scatterplot with both axes capped at their 95th percentile and a red
# linear-model fit line added.
ggplot(data = pf, aes(x = www_likes_received, y = likes_received)) +
  geom_point() +
  xlim(0, quantile(pf$www_likes_received, 0.95)) +
  ylim(0, quantile(pf$likes_received, 0.95)) +
  geom_smooth(method = "lm", color = "red")
## Warning: Removed 6075 rows containing non-finite values (stat_smooth).
## Warning: Removed 6075 rows containing missing values (geom_point).


Strong Correlations

Notes:

# Correlation test between web likes received and total likes received, over
# all users.
with(
  pf,
  cor.test(www_likes_received, likes_received)
)
## 
##  Pearson's product-moment correlation
## 
## data:  www_likes_received and likes_received
## t = 937.1, df = 99001, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.9473553 0.9486176
## sample estimates:
##       cor 
## 0.9479902

What’s the correlation between the two variables? Include the top 5% of values for the variable in the calculation and round to 3 decimal places.

Response:


Moira on Correlation

Notes:


More Caution with Correlation

Notes:

#install.packages('alr3')
library(alr3)
## Warning: package 'alr3' was built under R version 3.3.1
## Loading required package: car
## Warning: package 'car' was built under R version 3.3.1
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
# Load the Mitchell data set and open its help page.
data(Mitchell)
help("Mitchell")
## starting httpd help server ...
##  done

Create your plot!

# Temperature against month for the Mitchell data.
ggplot(data = Mitchell, aes(x = Month, y = Temp)) +
  geom_point()


Noisy Scatterplots

  1. Take a guess for the correlation coefficient for the scatterplot.

  2. What is the actual correlation of the two variables? (Round to the thousandths place)

# Correlation test between temperature and month in the Mitchell data.
cor.test(
  Mitchell$Temp,
  Mitchell$Month
)
## 
##  Pearson's product-moment correlation
## 
## data:  Mitchell$Temp and Mitchell$Month
## t = 0.81816, df = 202, p-value = 0.4142
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.08053637  0.19331562
## sample estimates:
##        cor 
## 0.05747063

Making Sense of Data

Notes:

# Temperature against month with an x-axis break every 12 months
# (12, 24, ..., 204).
ggplot(data = Mitchell, aes(x = Month, y = Temp)) +
  geom_point() +
  scale_x_continuous(breaks = 12 * (1:17))

# Same plot with the breaks generated by seq(), starting at 0.
ggplot(data = Mitchell, aes(x = Month, y = Temp)) +
  geom_point() +
  scale_x_continuous(breaks = seq(from = 0, to = 203, by = 12))


A New Perspective

What do you notice? Response:

Watch the solution video and check out the Instructor Notes! Notes:


Understanding Noise: Age to Age Months

Notes:

#ggplot(data = pf, aes(x = age, y = friendships_initiated)) + geom_point(alpha = 1/20, position = position_jitter(h = 0), color = 'orange') + geom_line(stat = 'summary', fun.y = mean) + geom_line(stat = 'summary', fun.y = quantile, fun.args = (probs = .1), linetype = 2, color = 'blue' ) + geom_line(stat = 'summary', fun.y = quantile, fun.args = (probs = .9), linetype = 2, color = 'blue' ) + geom_line(stat = 'summary', fun.y = median,  color = 'blue' ) + coord_cartesian(xlim = c(13,70), ylim = c(0,1000))
# Create a new variable, 'age_with_months', in the 'pf' data frame.
# Be sure to save the variable in the data frame rather than creating
# a separate, stand-alone variable. You will need to use the variables
# 'age' and 'dob_month' to create the variable 'age_with_months'.

# Assume the reference date for calculating age is December 31, 2013.
# Fractional age: whole years plus the months from dob_month to December,
# expressed as a fraction of a year.
pf$age_with_months <- pf$age + (12 - pf$dob_month) / 12

# Raw friendships_initiated points against fractional age, with mean (default
# colour), 10th/90th percentile (dashed blue) and median (solid blue) lines.
# `fun.args` is passed as a proper list; the original `(probs = ...)` form was
# an assignment that leaked a global `probs`. `height` is written in full
# instead of the partially matched `h`.
# NOTE: `fun.y` is deprecated in ggplot2 >= 3.3.0 in favour of `fun`; it is
# kept so the code still runs on the older ggplot2 this file was written with.
ggplot(data = pf, aes(x = age_with_months, y = friendships_initiated)) +
  geom_point(
    alpha = 1 / 20,
    position = position_jitter(height = 0),
    color = "orange"
  ) +
  geom_line(stat = "summary", fun.y = mean) +
  geom_line(
    stat = "summary", fun.y = quantile, fun.args = list(probs = 0.1),
    linetype = 2, color = "blue"
  ) +
  geom_line(
    stat = "summary", fun.y = quantile, fun.args = list(probs = 0.9),
    linetype = 2, color = "blue"
  ) +
  geom_line(stat = "summary", fun.y = median, color = "blue")

# Mean and median friend count and user count per fractional age.
age_with_month_groups <- group_by(pf, age_with_months)
pf.fc_by_age_months <- summarise(
  age_with_month_groups,
  friend_count_mean = mean(friend_count),
  friend_count_median = median(friend_count),
  n = n()
)
# The original arrange() call had no sort column and so did nothing; order the
# rows explicitly by age_with_months.
pf.fc_by_age_months <- arrange(pf.fc_by_age_months, age_with_months)

# Mean friend count per fractional age, drawn as points.
ggplot(
  data = pf.fc_by_age_months,
  aes(x = age_with_months, y = friend_count_mean)
) +
  geom_point()


Age with Months Means

# Mean friend count per fractional age as a line, zoomed to ages 13-70.
ggplot(
  data = pf.fc_by_age_months,
  aes(x = age_with_months, y = friend_count_mean)
) +
  geom_line() +
  coord_cartesian(xlim = c(13, 70))

Programming Assignment


Noise in Conditional Means


Smoothing Conditional Means

Notes:

# Mean friend count by fractional age, for ages below 71.
ggplot(
  data = subset(pf.fc_by_age_months, age_with_months < 71),
  aes(x = age_with_months, y = friend_count_mean)
) +
  geom_line()

# Mean friend count by whole-year age, for ages below 71.
ggplot(
  data = subset(pf.fc_by_age, age < 71),
  aes(x = age, y = friend_count_mean)
) +
  geom_line()

# p1: mean friend count by fractional age; p2: by whole-year age (both < 71).
p1 <- ggplot(
  data = subset(pf.fc_by_age_months, age_with_months < 71),
  aes(x = age_with_months, y = friend_count_mean)
) +
  geom_line()
p2 <- ggplot(
  data = subset(pf.fc_by_age, age < 71),
  aes(x = age, y = friend_count_mean)
) +
  geom_line()
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
# Stack the two plots in one column, whole-year plot on top.
grid.arrange(p2, p1, ncol = 1)

# p1: mean friend count by fractional age; p2: by whole-year age (both < 71).
p1 <- ggplot(
  data = subset(pf.fc_by_age_months, age_with_months < 71),
  aes(x = age_with_months, y = friend_count_mean)
) +
  geom_line()
p2 <- ggplot(
  data = subset(pf.fc_by_age, age < 71),
  aes(x = age, y = friend_count_mean)
) +
  geom_line()
# p3: mean friend count with age rounded to the nearest multiple of 5.
p3 <- ggplot(
  data = subset(pf, age < 71),
  aes(x = round(age / 5) * 5, y = friend_count)
) +
  geom_line(stat = "summary", fun.y = mean)
library(gridExtra)
# Stack the three plots in one column.
grid.arrange(p1, p2, p3, ncol = 1)

# p1 and p2: the conditional-mean lines with a default geom_smooth() overlay.
p1 <- ggplot(
  data = subset(pf.fc_by_age_months, age_with_months < 71),
  aes(x = age_with_months, y = friend_count_mean)
) +
  geom_line() +
  geom_smooth()
p2 <- ggplot(
  data = subset(pf.fc_by_age, age < 71),
  aes(x = age, y = friend_count_mean)
) +
  geom_line() +
  geom_smooth()
# p3: mean friend count with age rounded to the nearest multiple of 5.
p3 <- ggplot(
  data = subset(pf, age < 71),
  aes(x = round(age / 5) * 5, y = friend_count)
) +
  geom_line(stat = "summary", fun.y = mean)
library(gridExtra)
# Stack the three plots in one column.
grid.arrange(p1, p2, p3, ncol = 1)


Which Plot to Choose?

Notes:


Analyzing Two Variables

Reflection:


Click KnitHTML to see all of your hard work and to have an html page of this lesson, your answers, and your notes!